1 Import Libraries

knitr::opts_chunk$set(echo = TRUE)
library(knitr) # for knitting markdown files
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) # for plotting
library(broom)
library(reshape2)
#library(readr)
#library(readxl)
#library(Ecdat)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
#library(plm)
#library(pwt9)
#library(quarto)
library(renv)
## 
## Attaching package: 'renv'
## The following objects are masked from 'package:stats':
## 
##     embed, update
## The following objects are masked from 'package:utils':
## 
##     history, upgrade
## The following objects are masked from 'package:base':
## 
##     autoload, load, remove
library(shiny)
## 
## Attaching package: 'shiny'
## The following object is masked from 'package:renv':
## 
##     isolate
library(targets)
library(testthat)
## 
## Attaching package: 'testthat'
## The following object is masked from 'package:targets':
## 
##     matches
## The following object is masked from 'package:dplyr':
## 
##     matches
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   0.3.5
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::edition_get()   masks testthat::edition_get()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ purrr::is_null()       masks testthat::is_null()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ readr::local_edition() masks testthat::local_edition()
## ✖ tidyr::matches()       masks testthat::matches(), targets::matches(), dplyr::matches()
## ✖ purrr::modify()        masks renv::modify()
library(tibble)
library(lubridate)
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(purrr)
library(Hmisc) # for dependence tests
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:testthat':
## 
##     describe
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(plotly) # for interactive plots
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(hrbrthemes) 
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(xts) # for time series objects
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
#library(zoo)
library(seasonal) # for seasonality of time series
## 
## Attaching package: 'seasonal'
## 
## The following object is masked from 'package:tibble':
## 
##     view
library(tsbox)
library(forecast) # for forecasting time series
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries) # for unit root tests
#library(seasonalview)
#library(autoplotly)

library(tidyverse) # general
#library(ggalt) # dumbbell plots
library(plotly) #for drawing interactive plots
library(ggridges) #for drawing density gradient
library(shades) #edit colors in natural ways:
library(urca) 
library(tseries)
library(vars) # for VAR models
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:plotly':
## 
##     select
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: strucchange
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## Loading required package: lmtest
library(dynlm)
library(Metrics)
## 
## Attaching package: 'Metrics'
## 
## The following object is masked from 'package:forecast':
## 
##     accuracy
library(htmlTable) # for showing tables
#library(keras)
#library(tensorflow)
#install_keras()
#install_tensorflow(version = "nightly")
library(reticulate)
Sys.unsetenv("RETICULATE_PYTHON") 
use_virtualenv("~/.virtualenvs/r-reticulate")
library(keras)
colorize <- function(x, color) {
  if (knitr::is_latex_output()) {
    sprintf("\\textcolor{%s}{%s}", color, x)
  } else if (knitr::is_html_output()) {
    sprintf("<span style='color: %s;'>%s</span>", color,
      x)
  } else x
}

2 Introduction

2.1 Describe Dataset

The dataset used for this project is Daily Delhi Climate, which consists of the following columns:

  1. date: Date of format YYYY-MM-DD starting from “2013-01-01” and ending in “2017-01-01”.
  2. meantemp: Mean temperature averaged out from multiple 3 hour intervals in a day.
  3. humidity: Humidity value for the day (units are grams of water vapor per cubic meter volume of air).
  4. wind_speed: Wind speed measured in kmph.
  5. mean_pressure: Pressure reading of weather (measure in atm)

2.2 Goal and Procedure

The goal of this project is to analyze and forecast the mean temperature of Delhi, India, as well as wind speed of there. The two mentioned quantities are recorded in the meantemp column and wind_speed column respectively. As these columns are recorded in time, they are regarded as time series. For the ARIMA model, only meantemp is forecasted.

The following four forecasting models are used for this work: autoregressive–moving-average model (ARMA), vector autoregression (VAR), feedforward neural network (NN), and long-short term memory (LSTM) neural network. After importing the dataset, the data is processed in Preprocessing and Analysis Section. In this section, outliers are removed, time series objects are created from columns of dataframe, seasonality and trend are studied, and correlation plots are visualized. In Testing Stationary, the stationarity of the time series is checked using stationary tests. Four forecasting models are used Forecast Time Series section to forecast the two time series of meantemp and wind_speed. The ARMA model is used only for meantemp. In Compare Metrics, all the models are compared using different evaluation metrics.

df_train <- read_csv("data/DailyDelhiClimateTrain.csv")
## Rows: 1462 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (4): meantemp, humidity, wind_speed, meanpressure
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_test <- read_csv("data/DailyDelhiClimateTest.csv")
## Rows: 114 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (4): meantemp, humidity, wind_speed, meanpressure
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(df_train)
##       date               meantemp        humidity        wind_speed    
##  Min.   :2013-01-01   Min.   : 6.00   Min.   : 13.43   Min.   : 0.000  
##  1st Qu.:2014-01-01   1st Qu.:18.86   1st Qu.: 50.38   1st Qu.: 3.475  
##  Median :2015-01-01   Median :27.71   Median : 62.62   Median : 6.222  
##  Mean   :2015-01-01   Mean   :25.50   Mean   : 60.77   Mean   : 6.802  
##  3rd Qu.:2016-01-01   3rd Qu.:31.31   3rd Qu.: 72.22   3rd Qu.: 9.238  
##  Max.   :2017-01-01   Max.   :38.71   Max.   :100.00   Max.   :42.220  
##   meanpressure     
##  Min.   :  -3.042  
##  1st Qu.:1001.580  
##  Median :1008.563  
##  Mean   :1011.105  
##  3rd Qu.:1014.945  
##  Max.   :7679.333
df_train |> dplyr::glimpse()
## Rows: 1,462
## Columns: 5
## $ date         <date> 2013-01-01, 2013-01-02, 2013-01-03, 2013-01-04, 2013-01-…
## $ meantemp     <dbl> 10.000000, 7.400000, 7.166667, 8.666667, 6.000000, 7.0000…
## $ humidity     <dbl> 84.50000, 92.00000, 87.00000, 71.33333, 86.83333, 82.8000…
## $ wind_speed   <dbl> 0.0000000, 2.9800000, 4.6333333, 1.2333333, 3.7000000, 1.…
## $ meanpressure <dbl> 1015.667, 1017.800, 1018.667, 1017.167, 1016.500, 1018.00…
df_test |> summary()
##       date               meantemp        humidity       wind_speed    
##  Min.   :2017-01-01   Min.   :11.00   Min.   :17.75   Min.   : 1.387  
##  1st Qu.:2017-01-29   1st Qu.:16.44   1st Qu.:39.62   1st Qu.: 5.564  
##  Median :2017-02-26   Median :19.88   Median :57.75   Median : 8.069  
##  Mean   :2017-02-26   Mean   :21.71   Mean   :56.26   Mean   : 8.144  
##  3rd Qu.:2017-03-26   3rd Qu.:27.71   3rd Qu.:71.90   3rd Qu.:10.069  
##  Max.   :2017-04-24   Max.   :34.50   Max.   :95.83   Max.   :19.314  
##   meanpressure 
##  Min.   :  59  
##  1st Qu.:1007  
##  Median :1013  
##  Mean   :1004  
##  3rd Qu.:1017  
##  Max.   :1023
df_test |> dplyr::glimpse()
## Rows: 114
## Columns: 5
## $ date         <date> 2017-01-01, 2017-01-02, 2017-01-03, 2017-01-04, 2017-01-…
## $ meantemp     <dbl> 15.91304, 18.50000, 17.11111, 18.70000, 18.38889, 19.3181…
## $ humidity     <dbl> 85.86957, 77.22222, 81.88889, 70.05000, 74.94444, 79.3181…
## $ wind_speed   <dbl> 2.743478, 2.894444, 4.016667, 4.545000, 3.300000, 8.68181…
## $ meanpressure <dbl> 59.000, 1018.278, 1018.333, 1015.700, 1014.333, 1011.773,…

3 Visualize Data

Below we can see interactive plot of the meantemp time series.

p <- df_train |>
  ggplot( aes(x=date, y=meantemp)) +
    geom_area(fill="#69b3a2", alpha=0.5) +
    geom_line(color="#69b3a2") +
    ylab("bitcoin price ($)") +
    theme_ipsum()

# Turn it interactive with ggplotly
p <- ggplotly(p)
#p
p

The same plot for the wind_speed time series is also provided.

p <- df_train |>
  ggplot( aes(x=date, y=wind_speed)) +
    geom_area(fill="#69b3a2", alpha=0.5) +
    geom_line(color="#69b3a2") +
    ylab("bitcoin price ($)") +
    theme_ipsum()

# Turn it interactive with ggplotly
p <- ggplotly(p)
#p
p

4 Preprocessing

4.1 Treat Outliers: Manual

Evidenced by the plot of time series, an outlier at the last observation (last row of dataframe) can be detected. It causes an abrupt decrease in value of temperature. This would lead to problems in further analysis and forecasting models. Therefore, last observation’s value is replaced with its previous one. In Treat Outliers: Automatic, an automatic procedure of treating outliers is performed to remove further outliers.

previous_value <- df_train$meantemp[df_train$date == as.Date('2016-12-31')]

df_train$meantemp[df_train$date == as.Date('2017-01-01')]<- previous_value 
#df_train <- head(df_train, -1)
head(df_train)
## # A tibble: 6 × 5
##   date       meantemp humidity wind_speed meanpressure
##   <date>        <dbl>    <dbl>      <dbl>        <dbl>
## 1 2013-01-01    10        84.5       0           1016.
## 2 2013-01-02     7.4      92         2.98        1018.
## 3 2013-01-03     7.17     87         4.63        1019.
## 4 2013-01-04     8.67     71.3       1.23        1017.
## 5 2013-01-05     6        86.8       3.7         1016.
## 6 2013-01-06     7        82.8       1.48        1018
tail(df_train)
## # A tibble: 6 × 5
##   date       meantemp humidity wind_speed meanpressure
##   <date>        <dbl>    <dbl>      <dbl>        <dbl>
## 1 2016-12-27     16.8     67.6       8.34        1017.
## 2 2016-12-28     17.2     68.0       3.55        1016.
## 3 2016-12-29     15.2     87.9       6           1017.
## 4 2016-12-30     14.1     89.7       6.27        1018.
## 5 2016-12-31     15.1     87         7.32        1016.
## 6 2017-01-01     15.1    100         0           1016

The plot of data after removing the outlier is visualized below:

p <- df_train |>
  ggplot( aes(x=date, y=meantemp)) +
    geom_area(fill="#69b3a2", alpha=0.5) +
    geom_line(color="#69b3a2") +
    ylab("bitcoin price ($)") +
    theme_ipsum()

# Turn it interactive with ggplotly
p <- ggplotly(p)
#p
p

In below we find if there is any missing date

date_range <- seq(min(df_train$date), max(df_train$date), by = 1) 
date_range[!date_range %in% df_train$date] 
## Date of length 0

4.2 Construct Time Series

Thee meantemp column is used to create time series data. The time series is assigned to xts objects. But since many functions later require ts object, each time a xts object is defined, a ts counterpart is created for further possible use. The conversion from xts to ts is performed using txbox::ts_ts function.

min(df_train$date)
## [1] "2013-01-01"
max(df_train$date)
## [1] "2017-01-01"
#ts_train <- zoo(df_train$meantemp, df_train$date)

xts_train_meantemp <- xts(df_train$meantemp, order.by=df_train$date, "%Y-%m-%d")
class(xts_train_meantemp)
## [1] "xts" "zoo"
head(xts_train_meantemp)
##                 [,1]
## 2013-01-01 10.000000
## 2013-01-02  7.400000
## 2013-01-03  7.166667
## 2013-01-04  8.666667
## 2013-01-05  6.000000
## 2013-01-06  7.000000
tail(xts_train_meantemp)
##                [,1]
## 2016-12-27 16.85000
## 2016-12-28 17.21739
## 2016-12-29 15.23810
## 2016-12-30 14.09524
## 2016-12-31 15.05263
## 2017-01-01 15.05263
# convert xts to ts

## Create a daily Date object for ts
#inds <- seq(as.Date("2013-01-01"), as.Date("2017-01-01"), by = "day")

#set.seed(25)
#ts_train <- ts(df_train$meantemp,     # random data
#           start = c(2013, as.numeric(format(inds[1], "%j"))),
#           frequency = 365)


#ts_train <- ts(df_train$meantemp, start = decimal_date(ymd("2013-01-01")), frequency = 365.25 / 7)

Convert xts objects to ts objects:

ts_train_meantemp <- ts_ts(xts_train_meantemp)
head(ts_train_meantemp)
## Time Series:
## Start = 2013 
## End = 2013.01368953503 
## Frequency = 365.2425 
## [1] 10.000000  7.400000  7.166667  8.666667  6.000000  7.000000
tail(ts_train_meantemp)
## Time Series:
## Start = 2016.98639260218 
## End = 2017.00008213721 
## Frequency = 365.2425 
## [1] 16.85000 17.21739 15.23810 14.09524 15.05263 15.05263

Static plot is show in the following:

ts_plot(xts_train_meantemp)

4.3 Treat Outliers: Automatic

The tsclean function identifies and replace outliers and missing values in a time series. It uses Friedman’s Super Smoother estimator for non-seasonal series and a robust STL decomposition for seasonal series. To estimate missing values and outlier replacements, linear interpolation is used on the (possibly seasonally adjusted) series. In our case, the robust STL decomposition is used due to existing of seasonalities.

ts_plot(xts_train_meantemp)

xts_train_meantemp <- tsclean(xts_train_meantemp)
ts_plot(xts_train_meantemp)

I speculate that since there was not high fluctuations in the time series, there was no change. However, for instance in the wind_speed column, there are salient changes when using tsclean, as evident when applying in VAR.

4.4 Seasonality

From the initial plot I judge that there is seasonality. For more delicate observation to find if there is more granular periods of seasonality, I use seasonality plots. Before that, I aggregate data weekly, monthly, and quarterly.

4.4.1 Seasonality Plots

# Weekly mean temperature
xts_week_train_meantemp <- apply.weekly(xts_train_meantemp,sum)
ts_week_train_meantemp <-ts_ts(xts_week_train_meantemp)

# Monthly mean temperature
xts_mon_train_meantemp <- aggregate(xts_train_meantemp, by=as.yearmon, FUN=sum)
ts_mon_train_meantemp <-ts_ts(xts_mon_train_meantemp)

# Quarterly mean temperature
xts_quar_train_meantemp <- aggregate(xts_train_meantemp, as.yearqtr, FUN=sum)
ts_quar_train_meantemp <-ts_ts(xts_quar_train_meantemp)


# Yearly mean temperate
as.year <- function(x) as.integer(as.yearmon(x))
xts_year_train_meantemp <- aggregate(xts_train_meantemp, by=as.year, FUN=sum)
#ts_year_train_meantemp <-ts_ts(xts_year_train_meantemp)
#xts_year_train_meantemp[1]

The year 2017 has only one observation, so it is removed it from all the aggregated datasets. I couldn’t do it before aggregating, otherwise I would have confronted the error Error: series has no regular pattern.

xts_week_train_meantemp <- head(xts_week_train_meantemp, -1)
xts_mon_train_meantemp <- head(xts_mon_train_meantemp, -1)
xts_quar_train_meantemp <- head(xts_quar_train_meantemp, -1)

ts_week_train_meantemp <- head(ts_week_train_meantemp, -1)
ts_mon_train_meantemp <- head(ts_mon_train_meantemp, -1)
ts_quar_train_meantemp <- head(ts_quar_train_meantemp, -1)
#options(repr.plot.width = 7, repr.plot.height =20)
forecast::ggseasonplot(ts_mon_train_meantemp, year.labels=TRUE, year.labels.left=TRUE, labelgap = 0.1) +
  ylab("degree") +
  ggtitle("Seasonal plot: Monthly Mean Temperature")

forecast::ggseasonplot(ts_mon_train_meantemp, year.labels=TRUE, year.labels.left=TRUE, labelgap = 0.1, polar=TRUE) +
  ylab("degree") +
  ggtitle("Polar Seasonal plot: Monthly Mean Temperature")

#options(repr.plot.width = 7, repr.plot.height =20)
forecast::ggseasonplot(ts_quar_train_meantemp, year.labels=TRUE, year.labels.left=TRUE, labelgap = 0.1) +
  ylab("degree") +
  ggtitle("Seasonal plot: Quarterly Mean Temperature")

forecast::ggseasonplot(ts_quar_train_meantemp, year.labels=TRUE, year.labels.left=TRUE, labelgap = 0.1, polar=TRUE) +
  ylab("degree") +
  ggtitle("Polar Seasonal plot: Quarterly Mean Temperature")

Judging from the plots, all months seem to have seasonalities, with stronger ones in second and third quarters of each year.

4.4.2 Deseasonalize

If one intends to remove different periods of seasonality together, he can use the forecast::msts function. For instance in below, weekly and yearly seasonality are removed together.

des_ts_train_meantemp <- msts(xts_train_meantemp,seasonal.periods = c(7,365))
#head(des_xts_train)
#library(tsbox)
#ts_train <-ts_ts(xts_train)
#ts_train

class(des_ts_train_meantemp)
## [1] "msts" "ts"

The output of msts had an a peculiar shape, and it is not using the state-of-the-art X13 decomposition. To address these limitations, I incorporated the X-13ARIMA-SEATS using seasonal:seas function. it has some of its own limitations, as stated in the package’s reference manua. For instance, the number of observations must not exceed 780, nor should maximum seasonal period exceed 12. That is why I couldn’t use original data ts_train and also the weekly aggregated data ts_week_train, as I would confront the error Seasonal period too large. The only possible aggregated data with highest frequency possible was monthly aggregated, ts_mon_train. However, I am concerned that I would lose significant pattern and information with this amount of aggregation.

length(xts_train_meantemp)
## [1] 1462
length(ts_train_meantemp)
## [1] 1462
length(xts_train_meantemp)
## [1] 1462
nowXTS <-ts_xts(ts_train_meantemp)
length(nowXTS)
## [1] 1462
length(ts_week_train_meantemp)
## [1] 208
plot(ts_week_train_meantemp)

length(ts_week_train_meantemp)
## [1] 208
plot(ts_train_meantemp)

length(ts_train_meantemp)
## [1] 1462
plot(ts_mon_train_meantemp)

length(ts_mon_train_meantemp)
## [1] 48
m <- seas(ts_mon_train_meantemp)
ts_train_adj_meantemp <- final(m)
#ts_train_adj
length(ts_train_adj_meantemp)
## [1] 48
m <- seas(ts_mon_train_meantemp)
ts_train_adj_meantemp <- final(m)
#ts_train_adj
length(ts_train_adj_meantemp)
## [1] 48
plot(ts_train_adj_meantemp)

Plot original data along with trend and seasonally adjusted data

#ts_train
#series(m, "forecast.forecasts")
#out(m)
#seasadj(m)
autoplot(ts_mon_train_meantemp, series="Original Data") +
autolayer(trendcycle(m), series="Trend") +
autolayer(seasadj(m), series="Seasonally Adjusted") +
xlab("Year") + ylab("Mean Temperature") +
ggtitle("Mean Temperature Decomposed using X13") +
scale_colour_manual(values=c("gray","blue","red"),
           breaks=c("Original Data","Seasonally Adjusted","Trend"))

#ap < ggplotly(ap)

4.5 Detrend

In the seasonally adjusted time series ts_train_adj, a trend is salient. It can be removed by differencing as follows:

#ts_train_adj_meantemp |> log() |> nsdiffs(alpha=0.01) -> ts_train_adj_det_meantemp
ts_train_adj_meantemp |> log() |> diff() -> ts_train_adj_det_meantemp
plot(ts_train_adj_det_meantemp)

#plot(d)

4.6 Correlation Plots

In the following the autocorrelation function (AC) and partial autocorrelation function (PACF) are visualized for both unadjusted and adjusted verions of time series.

  1. Weekly aggregated of original time series
ggAcf(ts_week_train_meantemp, lag=50)

pacf (ts_week_train_meantemp, lag=50, pl = TRUE)

  1. Seasonally Adjusted
ggAcf(ts_train_adj_meantemp, lag=10)

pacf (ts_train_adj_meantemp, lag=10, pl = TRUE)

  1. Seasonally Adjusted and Detrended
ggAcf(ts_train_adj_det_meantemp, lag=10)

pacf (ts_train_adj_det_meantemp, lag=10, pl = TRUE)

4.7 Prepare Test Set

In section Preprocessing and Analysis Section, all the preprocessing steps are applied on the training dataset. In the following, same processes are applied on the test dataset.

summary(df_test)
##       date               meantemp        humidity       wind_speed    
##  Min.   :2017-01-01   Min.   :11.00   Min.   :17.75   Min.   : 1.387  
##  1st Qu.:2017-01-29   1st Qu.:16.44   1st Qu.:39.62   1st Qu.: 5.564  
##  Median :2017-02-26   Median :19.88   Median :57.75   Median : 8.069  
##  Mean   :2017-02-26   Mean   :21.71   Mean   :56.26   Mean   : 8.144  
##  3rd Qu.:2017-03-26   3rd Qu.:27.71   3rd Qu.:71.90   3rd Qu.:10.069  
##  Max.   :2017-04-24   Max.   :34.50   Max.   :95.83   Max.   :19.314  
##   meanpressure 
##  Min.   :  59  
##  1st Qu.:1007  
##  Median :1013  
##  Mean   :1004  
##  3rd Qu.:1017  
##  Max.   :1023
df_test |> describe()
## df_test 
## 
##  5  Variables      114  Observations
## --------------------------------------------------------------------------------
## date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##        114          0        114          1 2017-02-26      38.33 2017-01-06 
##        .10        .25        .50        .75        .90        .95 
## 2017-01-12 2017-01-29 2017-02-26 2017-03-26 2017-04-12 2017-04-18 
## 
## lowest : 2017-01-01 2017-01-02 2017-01-03 2017-01-04 2017-01-05
## highest: 2017-04-20 2017-04-21 2017-04-22 2017-04-23 2017-04-24
## --------------------------------------------------------------------------------
## meantemp 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      114        0      105        1    21.71    7.226    13.22    14.75 
##      .25      .50      .75      .90      .95 
##    16.44    19.88    27.71    31.00    32.67 
## 
## lowest : 11.00000 11.72222 11.78947 12.11111 13.04167
## highest: 32.90000 33.50000 34.00000 34.25000 34.50000
## --------------------------------------------------------------------------------
## humidity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      114        0      109        1    56.26    21.97    26.74    29.49 
##      .25      .50      .75      .90      .95 
##    39.62    57.75    71.90    78.42    82.20 
## 
## lowest : 17.75000 19.42857 21.12500 24.12500 26.00000
## highest: 83.52632 84.44444 85.86957 91.64286 95.83333
## --------------------------------------------------------------------------------
## wind_speed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      114        0      109        1    8.144    4.031    2.842    3.715 
##      .25      .50      .75      .90      .95 
##    5.564    8.069   10.069   13.464   14.353 
## 
## lowest :  1.387500  1.625000  1.854545  1.950000  2.100000
## highest: 14.384615 15.512500 16.662500 17.590000 19.314286
## --------------------------------------------------------------------------------
## meanpressure 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      114        0      109        1     1004    23.16     1002     1004 
##      .25      .50      .75      .90      .95 
##     1007     1013     1017     1019     1021 
## 
## lowest :   59.000  998.625  999.875 1000.875 1001.600
## highest: 1021.375 1021.556 1021.789 1021.958 1022.810
##                                   
## Value         60  1000  1010  1020
## Frequency      1    14    53    46
## Proportion 0.009 0.123 0.465 0.404
## 
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
xts_test_meantemp <- xts(df_test$meantemp, order.by=df_test$date, "%Y-%m-%d")
head(xts_test_meantemp)
##                [,1]
## 2017-01-01 15.91304
## 2017-01-02 18.50000
## 2017-01-03 17.11111
## 2017-01-04 18.70000
## 2017-01-05 18.38889
## 2017-01-06 19.31818
tail(xts_test_meantemp)
##              [,1]
## 2017-04-19 33.500
## 2017-04-20 34.500
## 2017-04-21 34.250
## 2017-04-22 32.900
## 2017-04-23 32.875
## 2017-04-24 32.000
ts_plot(xts_test_meantemp)

ts_test_meantemp <- ts_ts(xts_test_meantemp)
xts_week_test_meantemp <- apply.weekly(xts_test_meantemp,sum)
ts_week_test_meantemp <- na.remove(ts_ts(xts_week_test_meantemp))
#ts_week_test_meantemp <- as.ts(xts_week_test_meantemp)
length(ts_week_test_meantemp)
## [1] 18
ts_plot(xts_week_test_meantemp)

5 Testing Stationarity: Unit Root Tests

5.1 ADF

In Augmented Dicky Fuller (ADF) test, the null hypothesis is \(H_0\): there is a unit root (equivalently, is a non-stationary time series), while the alternate hypothesis is \(H_1\): time series is stationary. DF test is valid if the time series is well characterized by an \(AR(1)\) model with noise errors. ADF test unlike DF test can be applied on a large sized set of time series models. For this reason, it was preferred in this work over DF test.

  1. Original Time Series and its Weekly Adjusted
ts_train_meantemp |> adf.test()
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_train_meantemp
## Dickey-Fuller = -1.9871, Lag order = 11, p-value = 0.5838
## alternative hypothesis: stationary
ts_week_train_meantemp |> adf.test()
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_week_train_meantemp
## Dickey-Fuller = -3.8729, Lag order = 5, p-value = 0.01651
## alternative hypothesis: stationary
  1. Seasonally Adjusted
ts_train_adj_meantemp |> adf.test() 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_train_adj_meantemp
## Dickey-Fuller = -2.5897, Lag order = 3, p-value = 0.3386
## alternative hypothesis: stationary
  1. Seasonally Adjusted and Detrended
ts_train_adj_det_meantemp |> adf.test() 
## Warning in adf.test(ts_train_adj_det_meantemp): p-value smaller than printed p-
## value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_train_adj_det_meantemp
## Dickey-Fuller = -4.698, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary

In the ADF test, smaller test statistics indicates more likelihood of the null-hypothesis (time series is not stationary) be true. If we set threshold to be 0.05, the p-values less than this value imply null hypothesis is unlikely to be true, and we can increase our certainty in alternate hypothesis (time series is stationary). For all the investigated datasets, results are reported in the following:

  1. Original time series: non-stationary
  2. Seasonally adjusted: non-stationary
  3. Seasonally adjusted and detrended: stationary
  4. Weekly aggregated time series: stationary

5.2 KPSS

In Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test, the null hypothesis is \(H_0\): time series is stationary (level or trend), while the alternate hypothesis is \(H_1\): time series is non-stationary.

  1. Original Time Series and also its weekly aggregated
ts_train_meantemp |>  kpss.test()
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_train_meantemp
## KPSS Level = 0.58122, Truncation lag parameter = 7, p-value = 0.02434
ts_week_train_meantemp |>  kpss.test()
## Warning in kpss.test(ts_week_train_meantemp): p-value greater than printed p-
## value
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_week_train_meantemp
## KPSS Level = 0.16182, Truncation lag parameter = 4, p-value = 0.1
  1. Seasonally Adjusted
ts_train_adj_meantemp |>  kpss.test()
## Warning in kpss.test(ts_train_adj_meantemp): p-value smaller than printed p-
## value
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_train_adj_meantemp
## KPSS Level = 0.99735, Truncation lag parameter = 3, p-value = 0.01
  1. Seasonally Adjusted and Detrended
ts_train_adj_det_meantemp |> kpss.test()
## Warning in kpss.test(ts_train_adj_det_meantemp): p-value greater than printed p-
## value
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_train_adj_det_meantemp
## KPSS Level = 0.059493, Truncation lag parameter = 3, p-value = 0.1

For all the investigated datasets, results are reported in the following:

  1. Original time series: non-stationary
  2. Seasonally adjusted: stationary
  3. Seasonally adjusted and detrended: non-stationary
  4. Weekly aggregated time series: non-stationary

6 Forecasting

6.1 SARIMA

The seasonal autoregressive integrated moving average (SARIMA) is used in the following to forecast time series. The auto.arima function is used to find the following parameters, in which (p, d, q) correspond to non-seasonal component of time series, whereas (P, D, Q) correspond to seasonal component.

  • p: Auto-regressive lag order
  • d: Order of first-differencing
  • q: Moving average lag order
  • P: Auto-regressive lag order
  • D: Order of seasonal-differencing
  • Q: Moving average lag order

The function returns the best ARIMA model according to either AIC. Since the runtime of the function is very long, the best parameters found are stored, and then they are used as both minimum and maximum option, although the auto.arima function nonetheless tries some initial fixed orders before using the predefined lags. Therefore, the options are reduced but not to 1 combination.

  1. Forecast original time series of meantemp, as the original data has very high frequency, which makes it unsuitable for ARMA. For this case, I set seasonal=TRUE, as in subsequent cases I use data that I seasonally adjusted them already. Setting seasonal=TRUE makes the model more time-consuming.
forecast_ts_train_meantemp <- auto.arima(ts_train_meantemp,
                            d = 1,
                            D = 1,
                            start.p = 2,
                            start.q = 3,
                            max.p = 2,
                            max.q = 3,
                            start.P = 0,
                            start.Q = 0,
                            max.P = 0,
                            max.Q = 0,
                            trace = TRUE, 
                            seasonal=TRUE,
                            stepwise=TRUE,
                            approximation=FALSE
                            #,xreg = xreg_matrix
                            )
## 
##  ARIMA(2,1,3)(0,1,0)[365]                    : 4789.495
##  ARIMA(0,1,0)(0,1,0)[365]                    : 4943.724
##  ARIMA(1,1,0)(0,1,0)[365]                    : 4926.887
##  ARIMA(0,1,1)(0,1,0)[365]                    : 4920.711
##  ARIMA(1,1,3)(0,1,0)[365]                    : 4789.51
##  ARIMA(2,1,2)(0,1,0)[365]                    : Inf
##  ARIMA(1,1,2)(0,1,0)[365]                    : 4790.218
## 
##  Best model: ARIMA(2,1,3)(0,1,0)[365]
checkresiduals(forecast_ts_train_meantemp)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,3)(0,1,0)[365]
## Q* = 363.76, df = 287, p-value = 0.001427
## 
## Model df: 5.   Total lags used: 292
forecast_ts_train_meantemp
## Series: ts_train_meantemp 
## ARIMA(2,1,3)(0,1,0)[365] 
## 
## Coefficients:
##          ar1     ar2      ma1      ma2      ma3
##       0.0104  0.4362  -0.2760  -0.6134  -0.0888
## s.e.  0.3402  0.2577   0.3395   0.3585   0.0480
## 
## sigma^2 = 4.561:  log likelihood = -2388.71
## AIC=4789.42   AICc=4789.5   BIC=4819.41
  1. Forecast original time series of meantemp but aggregated weekly, as the original data has very high frequency, which makes it unsuitable for ARMA. For this case, I set seasonal=TRUE, as in case 3, I use data that I seasonally adjusted them already. Setting seasonal=TRUE makes the model more time-consuming.
forecast_ts_week_train_meantemp = auto.arima(ts_week_train_meantemp, 
                                             d = 1, 
                                             D = 1, 
                                             start.p = 4, 
                                             start.q = 0, 
                                             max.p = 4, 
                                             max.q = 0, 
                                             start.P = 1, 
                                             start.Q = 0, 
                                             max.P = 1, 
                                             max.Q = 0, 
                                             trace = TRUE,  
                                             seasonal=TRUE, 
                                             stepwise=FALSE, 
                                             approximation=FALSE)
## 
##  ARIMA(0,1,0)(0,1,0)[52]                    : 1348.451
##  ARIMA(0,1,0)(1,1,0)[52]                    : 1317.269
##  ARIMA(1,1,0)(0,1,0)[52]                    : 1333.282
##  ARIMA(1,1,0)(1,1,0)[52]                    : 1303.962
##  ARIMA(2,1,0)(0,1,0)[52]                    : 1325.56
##  ARIMA(2,1,0)(1,1,0)[52]                    : 1295.714
##  ARIMA(3,1,0)(0,1,0)[52]                    : 1317.794
##  ARIMA(3,1,0)(1,1,0)[52]                    : 1286.975
##  ARIMA(4,1,0)(0,1,0)[52]                    : 1299.762
##  ARIMA(4,1,0)(1,1,0)[52]                    : 1269.65
## 
## 
## 
##  Best model: ARIMA(4,1,0)(1,1,0)[52]
checkresiduals(forecast_ts_week_train_meantemp)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,1,0)(1,1,0)[52]
## Q* = 33.201, df = 37, p-value = 0.6478
## 
## Model df: 5.   Total lags used: 42
forecast_ts_week_train_meantemp
## Series: ts_week_train_meantemp 
## ARIMA(4,1,0)(1,1,0)[52] 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4     sar1
##       -0.5755  -0.5066  -0.4439  -0.3623  -0.5049
## s.e.   0.0786   0.0849   0.0844   0.0792   0.0746
## 
## sigma^2 = 181.4:  log likelihood = -628.54
## AIC=1269.08   AICc=1269.65   BIC=1287.34
  1. Forecast deseasonalized time series
forecast_ts_train_adj_meantemp = auto.arima(ts_train_adj_meantemp, 
                                            trace = TRUE,  
                                            seasonal= FALSE, 
                                            stepwise=FALSE, 
                                            approximation=FALSE, 
                                            d = 1, 
                                            start.p = 0, 
                                            start.q = 1, 
                                            max.p = 0, 
                                            max.q = 1)
## 
##  ARIMA(0,1,0)                               : 441.3883
##  ARIMA(0,1,0)            with drift         : 443.1154
##  ARIMA(0,1,1)                               : 429.9577
##  ARIMA(0,1,1)            with drift         : 429.5371
## 
## 
## 
##  Best model: ARIMA(0,1,1)            with drift
checkresiduals(forecast_ts_train_adj_meantemp)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1) with drift
## Q* = 7.4191, df = 9, p-value = 0.5936
## 
## Model df: 1.   Total lags used: 10
forecast_ts_train_adj_meantemp
## Series: ts_train_adj_meantemp 
## ARIMA(0,1,1) with drift 
## 
## Coefficients:
##           ma1   drift
##       -0.6827  1.9880
## s.e.   0.1279  1.0526
## 
## sigma^2 = 488.7:  log likelihood = -211.49
## AIC=428.98   AICc=429.54   BIC=434.53
  1. Forecast deseasonalized and detrended time series
forecast_ts_train_adj_det_meantemp = auto.arima(ts_train_adj_det_meantemp, 
                                                trace = TRUE,
                                                seasonal= FALSE, 
                                                stepwise=FALSE, 
                                                approximation=FALSE,
                                                d = 0, 
                                                start.p = 0, 
                                                start.q = 1, 
                                                max.p = 0, 
                                                max.q = 1)
## 
##  ARIMA(0,0,0)            with zero mean     : -183.1965
##  ARIMA(0,0,0)            with non-zero mean : -181.4467
##  ARIMA(0,0,1)            with zero mean     : -195.2457
##  ARIMA(0,0,1)            with non-zero mean : -195.724
## 
## 
## 
##  Best model: ARIMA(0,0,1)            with non-zero mean
checkresiduals(forecast_ts_train_adj_det_meantemp)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1) with non-zero mean
## Q* = 7.1212, df = 8, p-value = 0.5236
## 
## Model df: 1.   Total lags used: 9
#checkresiduals(forecast_ts_train_meantemp)
forecast_ts_train_adj_det_meantemp
## Series: ts_train_adj_det_meantemp 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##           ma1    mean
##       -0.6984  0.0025
## s.e.   0.1247  0.0013
## 
## sigma^2 = 0.0008149:  log likelihood = 101.14
## AIC=-196.28   AICc=-195.72   BIC=-190.73

6.1.1 Evaluate

Based on the results from the forecast of original data (case 1), we have:

AIC_ARMA <- AIC(forecast_ts_train_meantemp)
AIC_ARMA
## [1] 4789.418
BIC_ARMA <- BIC(forecast_ts_train_meantemp)
BIC_ARMA
## [1] 4819.414

The following evaluation metrics between prediction and test data are manually computed: RMSE, MAE, \(R^2\) score.

forecast <- forecast_ts_train_meantemp |> forecast(h=114)
#forecast
predicted <- as.numeric(forecast$mean)
actual <- as.numeric(ts_test_meantemp)
RMSE_ARMA <- rmse(predicted, actual)
RMSE_ARMA
## [1] 4.298832
MAE_ARMA <- mae(predicted, actual)
MAE_ARMA
## [1] 3.575725
rsq <- function (x, y) cor(x, y) ^ 2

RSQ_ARMA <- rsq(actual, predicted)

RSQ_ARMA
## [1] 0.8174112

Two tables will be presented, one of which reporting metrics of the model applied on training set, and the other reporting metrics for evaluating predictions based on test set.

d <- cbind(AIC = AIC_ARMA, BIC = BIC_ARMA)
# at most 4 decimal places
knitr::kable(d, digits = 4)
AIC BIC
4789.418 4819.414
d <- cbind(R2 = RSQ_ARMA, RMSE = RMSE_ARMA, MAE = MAE_ARMA)
# at most 4 decimal places
knitr::kable(d, digits = 4)
R2 RMSE MAE
0.8174 4.2988 3.5757

6.1.2 Plot Forecast

  1. Original time series of meantemp
autoplot(forecast(forecast_ts_train_meantemp))# + autolayer(xts_test_meantemp)

length(ts_test_meantemp)
## [1] 114
forecast_ts_train_meantemp |> forecast(h=114) |>
autoplot() + autolayer(ts_test_meantemp)

  1. Original time series of meantemp but aggregated weekly
#autoplot(forecast(ts_week_train_meantemp)) #+ autolayer(ts_week_test_meantemp)
  1. Deseasonalized time series
#forecast_ts_train_adj + ts_train_adj
autoplot(forecast(forecast_ts_train_adj_meantemp))

  1. Deseasonalized and detrended time series
autoplot(forecast(forecast_ts_train_adj_det_meantemp))

The plot of forecasting the test data (using forecast from case 1) joint with test data is displayed in the following:

#ts_plot(ts_test_meantemp, forecast$mean)
#ts.union(ts_test_meantemp, forecast$mean)
#forecast$mean

xts_temp <- xts(ts_test_meantemp, order.by=df_test$date, "%Y-%m-%d")
xts_temp_2 <- xts(forecast$mean, order.by=df_test$date, "%Y-%m-%d")
#xts_temp
#xts_temp_2
ts_plot(xts_temp, xts_temp_2)

I confronted some issue adding the seasonality and trend componend of the adjusted versions to the original time series. Notwithstanding, judging by the results, the processed but unadjusted original data (without deseaosnalisation and without detrending) was a proper input to the ARMA model, as the function handles seasonalities by itself, and the results demonstrate a promising forecasting performance. Therefore, henceforth the processed time series but with no adjustment are used for the subsequent forecasting parts of this work, as subsequent models also handle seasonality.

6.2 Vector autoregressive (VAR)

The vector autoregressive can be used to model the multivariate time series consisting of both the columns meantemp and wind_speed.

The wind_speed is plotted interactively in the following:

p2 <- df_train |>
  ggplot( aes(x=date, y=wind_speed)) +
    geom_area(fill="#69b3a2", alpha=0.5) +
    geom_line(color="#69b3a2") +
    ylab("bitcoin price ($)") +
    theme_ipsum()

# Turn it interactive with ggplotly
p2 <- ggplotly(p2)
#p
p2

Same as Construct Time Series, a time series object is constructed from the wind_speed column.

#xts_train_meantemp <- xts(df_train$meantemp, order.by=df_train$date, "%Y-%m-%d")
#ts_train_meantemp <-ts_ts(xts_train_meantemp)

xts_train_windspeed <- xts(df_train$wind_speed, order.by=df_train$date, "%Y-%m-%d")
ts_train_windspeed <-ts_ts(xts_train_windspeed)

A static plot of time series is provided below:

ts_plot(ts_train_windspeed)

The outliers of time series are treated automatically, same as Treat Outliers: Automatic.

xts_train_windspeed <- tsclean(xts_train_windspeed)

The plot of the resulted time series is visualized below:

ts_plot(xts_train_windspeed)

The test data for wind_speed is created and processed using the same procedure followed for train data:

xts_test_windspeed <- xts(df_test$wind_speed, order.by=df_test$date, "%Y-%m-%d")
head(xts_test_windspeed)
##                [,1]
## 2017-01-01 2.743478
## 2017-01-02 2.894444
## 2017-01-03 4.016667
## 2017-01-04 4.545000
## 2017-01-05 3.300000
## 2017-01-06 8.681818
tail(xts_test_windspeed)
##                [,1]
## 2017-04-19  9.02500
## 2017-04-20  5.56250
## 2017-04-21  6.96250
## 2017-04-22  8.89000
## 2017-04-23  9.96250
## 2017-04-24 12.15714
ts_plot(xts_test_windspeed)

xts_test_windspeed <- tsclean(xts_test_windspeed)
ts_test_windspeed <- ts_ts(xts_test_windspeed)
ts_plot(xts_test_windspeed)

In what follows, interactive plot of both time series are illustrated:

fig <- plot_ly(df_train, type = 'scatter', mode = 'lines')%>%
  add_trace(x = ~date, y = ~meantemp, name = 'MeanTemp')%>%
  add_trace(x = ~date, y = ~wind_speed, name = 'WindSpeed')%>%
  layout(title = 'custom tick labels',legend=list(title=list(text='variable')),
         xaxis = list(dtick = "M1", tickformat= "%b\n%Y"), width = 2000)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
options(warn = -1)
fig <- fig %>%
  layout(
         xaxis = list(zerolinecolor = '#ffff',
                      zerolinewidth = 2,
                      gridcolor = 'ffff',  tickangle = 0),
         yaxis = list(zerolinecolor = '#ffff',
                      zerolinewidth = 2,
                      gridcolor = 'ffff'),
         plot_bgcolor='#e5ecf6')


fig

Weekly aggregated time series are also constructed:

# Weekly mean temperature
xts_week_train_windspeed <- apply.weekly(xts_train_windspeed, sum)
ts_week_train_windspeed <- ts_ts(xts_week_train_windspeed)

xts_week_test_windspeed <- apply.weekly(xts_test_windspeed, sum)
ts_week_test_windspeed <- na.remove(ts_ts(xts_week_test_windspeed))
#ts_week_test_windspeed <- as.ts(xts_week_test_windspeed)

The time series of wind_speed is visualized in the following:

ts_plot(xts_train_windspeed)

Both time series (meantemp and wind_speed) are merged and then fed to the VAR model. NA values are also removed from the merged time series.

#VAR_data <- ts.union(ts_train_meantemp, ts_train_windspeed)
VAR_data <- ts.union(ts_train_meantemp, ts_train_windspeed)
colnames(VAR_data) <- cbind("meantemp","wind_speed")
#v1 <- cbind(ts_week_train_meantemp, ts_week_train_windspeed)
#colnames(v1) <- cbind("meantemp","wind_speed")
#lagselect <- VARselect(v1, type = "both")
#lagselect$selection
VAR_data <- na.remove(VAR_data)
#tail(v1)

We look at different lags suggested by different criteria if we use VAR model.

lagselect <- VARselect(VAR_data, season=12, type = "both")
lagselect$selection
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##      8      6      4      8
lagselect$criteria
##                1         2         3         4         5         6         7
## AIC(n)  3.800121  3.779925  3.767371  3.743308  3.741000  3.731755  3.735946
## HQ(n)   3.840833  3.826065  3.818939  3.800305  3.803425  3.799609  3.809228
## SC(n)   3.909227  3.903578  3.905571  3.896056  3.908295  3.913598  3.932336
## FPE(n) 44.706670 43.812862 43.266294 42.237665 42.140335 41.752623 41.928049
##                8         9        10
## AIC(n)  3.730878  3.734485  3.733174
## HQ(n)   3.809587  3.818623  3.822740
## SC(n)   3.941815  3.959969  3.973205
## FPE(n) 41.716149 41.866995 41.812253

The results indicate that the lag with least AIC is 8. Now that we have merged the time series meantemp with wind_speed, we use VAR models on the merged time series Moreover, a cross-validated automatic selection of lags are performed by the model, with AIC as the chosen information criterion. Evidenced by the lag chosen by the model, it coincides with earlier discussion about lag. A seasonal lag was also included, with frequency chosen as 12, since monthly seasonalities were detected in seasonal plotsplots.

VAR_est <- vars::VAR(y = VAR_data, season=12, type="both", lag.max=50, ic="AIC")
VAR_est
## 
## VAR Estimation Results:
## ======================= 
## 
## Estimated coefficients for equation meantemp: 
## ============================================= 
## Call:
## meantemp = meantemp.l1 + wind_speed.l1 + meantemp.l2 + wind_speed.l2 + meantemp.l3 + wind_speed.l3 + meantemp.l4 + wind_speed.l4 + meantemp.l5 + wind_speed.l5 + meantemp.l6 + wind_speed.l6 + meantemp.l7 + wind_speed.l7 + meantemp.l8 + wind_speed.l8 + const + trend + sd1 + sd2 + sd3 + sd4 + sd5 + sd6 + sd7 + sd8 + sd9 + sd10 + sd11 
## 
##   meantemp.l1 wind_speed.l1   meantemp.l2 wind_speed.l2   meantemp.l3 
##  7.810204e-01 -2.496899e-02  4.986317e-02  9.149127e-03 -4.009502e-02 
## wind_speed.l3   meantemp.l4 wind_speed.l4   meantemp.l5 wind_speed.l5 
##  1.706517e-02  9.045036e-02 -6.949475e-03  2.275118e-02  2.299705e-03 
##   meantemp.l6 wind_speed.l6   meantemp.l7 wind_speed.l7   meantemp.l8 
##  4.227566e-02  3.540833e-02 -1.979551e-02 -5.409594e-03  4.870198e-02 
## wind_speed.l8         const         trend           sd1           sd2 
##  2.050005e-02  3.798320e-01 -7.695056e-05  2.745496e-02  2.228299e-01 
##           sd3           sd4           sd5           sd6           sd7 
##  3.006144e-02  2.235889e-01  6.901423e-03 -1.667707e-01  1.187114e-01 
##           sd8           sd9          sd10          sd11 
## -3.222232e-01  6.379275e-02 -9.910410e-03  1.802931e-02 
## 
## 
## Estimated coefficients for equation wind_speed: 
## =============================================== 
## Call:
## wind_speed = meantemp.l1 + wind_speed.l1 + meantemp.l2 + wind_speed.l2 + meantemp.l3 + wind_speed.l3 + meantemp.l4 + wind_speed.l4 + meantemp.l5 + wind_speed.l5 + meantemp.l6 + wind_speed.l6 + meantemp.l7 + wind_speed.l7 + meantemp.l8 + wind_speed.l8 + const + trend + sd1 + sd2 + sd3 + sd4 + sd5 + sd6 + sd7 + sd8 + sd9 + sd10 + sd11 
## 
##   meantemp.l1 wind_speed.l1   meantemp.l2 wind_speed.l2   meantemp.l3 
##  0.2185127982  0.3632403988  0.0945264807 -0.0164792488 -0.0200543986 
## wind_speed.l3   meantemp.l4 wind_speed.l4   meantemp.l5 wind_speed.l5 
##  0.0388117282 -0.1484483807  0.0151262154  0.0664703411  0.0237790175 
##   meantemp.l6 wind_speed.l6   meantemp.l7 wind_speed.l7   meantemp.l8 
## -0.0593081937  0.0187228817  0.1027670239  0.0400212477 -0.1588412141 
## wind_speed.l8         const         trend           sd1           sd2 
## -0.0146664877  1.4001059674 -0.0003113605 -0.0905338166 -0.7781730908 
##           sd3           sd4           sd5           sd6           sd7 
## -0.3486024266 -0.9378478834 -0.5343602855 -0.4915621564 -0.2253789180 
##           sd8           sd9          sd10          sd11 
## -0.7973204283 -0.0678101552 -0.3747672248  0.5638669619
summary(VAR_est)
## 
## VAR Estimation Results:
## ========================= 
## Endogenous variables: meantemp, wind_speed 
## Deterministic variables: both 
## Sample size: 1454 
## Log Likelihood: -6785.997 
## Roots of the characteristic polynomial:
## 0.9908 0.7934 0.7137 0.7137 0.701 0.701 0.6888 0.6888 0.6739 0.6739 0.6713 0.6713 0.6326 0.6326 0.5766 0.5766
## Call:
## vars::VAR(y = VAR_data, type = "both", season = 12L, lag.max = 50, 
##     ic = "AIC")
## 
## 
## Estimation results for equation meantemp: 
## ========================================= 
## meantemp = meantemp.l1 + wind_speed.l1 + meantemp.l2 + wind_speed.l2 + meantemp.l3 + wind_speed.l3 + meantemp.l4 + wind_speed.l4 + meantemp.l5 + wind_speed.l5 + meantemp.l6 + wind_speed.l6 + meantemp.l7 + wind_speed.l7 + meantemp.l8 + wind_speed.l8 + const + trend + sd1 + sd2 + sd3 + sd4 + sd5 + sd6 + sd7 + sd8 + sd9 + sd10 + sd11 
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## meantemp.l1    7.810e-01  2.646e-02  29.522  < 2e-16 ***
## wind_speed.l1 -2.497e-02  1.054e-02  -2.369  0.01799 *  
## meantemp.l2    4.986e-02  3.358e-02   1.485  0.13774    
## wind_speed.l2  9.149e-03  1.124e-02   0.814  0.41591    
## meantemp.l3   -4.010e-02  3.352e-02  -1.196  0.23187    
## wind_speed.l3  1.707e-02  1.124e-02   1.518  0.12913    
## meantemp.l4    9.045e-02  3.350e-02   2.700  0.00701 ** 
## wind_speed.l4 -6.949e-03  1.125e-02  -0.618  0.53680    
## meantemp.l5    2.275e-02  3.351e-02   0.679  0.49734    
## wind_speed.l5  2.300e-03  1.123e-02   0.205  0.83776    
## meantemp.l6    4.228e-02  3.349e-02   1.262  0.20702    
## wind_speed.l6  3.541e-02  1.123e-02   3.154  0.00164 ** 
## meantemp.l7   -1.980e-02  3.348e-02  -0.591  0.55444    
## wind_speed.l7 -5.410e-03  1.126e-02  -0.480  0.63113    
## meantemp.l8    4.870e-02  2.645e-02   1.842  0.06574 .  
## wind_speed.l8  2.050e-02  1.056e-02   1.941  0.05243 .  
## const          3.798e-01  1.707e-01   2.225  0.02624 *  
## trend         -7.695e-05  1.015e-04  -0.758  0.44859    
## sd1            2.745e-02  2.060e-01   0.133  0.89397    
## sd2            2.228e-01  2.053e-01   1.085  0.27800    
## sd3            3.006e-02  2.063e-01   0.146  0.88415    
## sd4            2.236e-01  2.057e-01   1.087  0.27722    
## sd5            6.901e-03  2.063e-01   0.033  0.97331    
## sd6           -1.668e-01  2.061e-01  -0.809  0.41859    
## sd7            1.187e-01  2.062e-01   0.576  0.56497    
## sd8           -3.222e-01  2.062e-01  -1.563  0.11831    
## sd9            6.379e-02  2.058e-01   0.310  0.75661    
## sd10          -9.910e-03  2.050e-01  -0.048  0.96145    
## sd11           1.803e-02  2.061e-01   0.087  0.93029    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 1.592 on 1425 degrees of freedom
## Multiple R-Squared: 0.9526,  Adjusted R-squared: 0.9517 
## F-statistic:  1023 on 28 and 1425 DF,  p-value: < 2.2e-16 
## 
## 
## Estimation results for equation wind_speed: 
## =========================================== 
## wind_speed = meantemp.l1 + wind_speed.l1 + meantemp.l2 + wind_speed.l2 + meantemp.l3 + wind_speed.l3 + meantemp.l4 + wind_speed.l4 + meantemp.l5 + wind_speed.l5 + meantemp.l6 + wind_speed.l6 + meantemp.l7 + wind_speed.l7 + meantemp.l8 + wind_speed.l8 + const + trend + sd1 + sd2 + sd3 + sd4 + sd5 + sd6 + sd7 + sd8 + sd9 + sd10 + sd11 
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## meantemp.l1    0.2185128  0.0664778   3.287  0.00104 ** 
## wind_speed.l1  0.3632404  0.0264891  13.713  < 2e-16 ***
## meantemp.l2    0.0945265  0.0843707   1.120  0.26274    
## wind_speed.l2 -0.0164792  0.0282518  -0.583  0.55978    
## meantemp.l3   -0.0200544  0.0842364  -0.238  0.81186    
## wind_speed.l3  0.0388117  0.0282413   1.374  0.16957    
## meantemp.l4   -0.1484484  0.0841714  -1.764  0.07801 .  
## wind_speed.l4  0.0151262  0.0282657   0.535  0.59263    
## meantemp.l5    0.0664703  0.0842155   0.789  0.43007    
## wind_speed.l5  0.0237790  0.0282168   0.843  0.39952    
## meantemp.l6   -0.0593082  0.0841527  -0.705  0.48107    
## wind_speed.l6  0.0187229  0.0282077   0.664  0.50696    
## meantemp.l7    0.1027670  0.0841316   1.222  0.22210    
## wind_speed.l7  0.0400212  0.0283055   1.414  0.15761    
## meantemp.l8   -0.1588412  0.0664539  -2.390  0.01697 *  
## wind_speed.l8 -0.0146665  0.0265368  -0.553  0.58057    
## const          1.4001060  0.4289658   3.264  0.00112 ** 
## trend         -0.0003114  0.0002551  -1.221  0.22248    
## sd1           -0.0905338  0.5175245  -0.175  0.86115    
## sd2           -0.7781731  0.5159620  -1.508  0.13173    
## sd3           -0.3486024  0.5183317  -0.673  0.50134    
## sd4           -0.9378479  0.5168820  -1.814  0.06982 .  
## sd5           -0.5343603  0.5182769  -1.031  0.30270    
## sd6           -0.4915622  0.5179391  -0.949  0.34274    
## sd7           -0.2253789  0.5182375  -0.435  0.66370    
## sd8           -0.7973204  0.5180895  -1.539  0.12404    
## sd9           -0.0678102  0.5171166  -0.131  0.89569    
## sd10          -0.3747672  0.5152011  -0.727  0.46709    
## sd11           0.5638670  0.5177786   1.089  0.27633    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 4 on 1425 degrees of freedom
## Multiple R-Squared: 0.2465,  Adjusted R-squared: 0.2317 
## F-statistic: 16.65 on 28 and 1425 DF,  p-value: < 2.2e-16 
## 
## 
## 
## Covariance matrix of residuals:
##            meantemp wind_speed
## meantemp     2.5345     0.4036
## wind_speed   0.4036    16.0038
## 
## Correlation matrix of residuals:
##            meantemp wind_speed
## meantemp    1.00000    0.06337
## wind_speed  0.06337    1.00000
summary(VAR_est$varresult)
##            Length Class Mode
## meantemp   12     lm    list
## wind_speed 12     lm    list
#VAR_est |> diagnostics()

6.2.1 Evaluate

Based on the model summary, the value of metrics obtained by the model can be observed, for different lag:

lagselect$criteria
##                1         2         3         4         5         6         7
## AIC(n)  3.800121  3.779925  3.767371  3.743308  3.741000  3.731755  3.735946
## HQ(n)   3.840833  3.826065  3.818939  3.800305  3.803425  3.799609  3.809228
## SC(n)   3.909227  3.903578  3.905571  3.896056  3.908295  3.913598  3.932336
## FPE(n) 44.706670 43.812862 43.266294 42.237665 42.140335 41.752623 41.928049
##                8         9        10
## AIC(n)  3.730878  3.734485  3.733174
## HQ(n)   3.809587  3.818623  3.822740
## SC(n)   3.941815  3.959969  3.973205
## FPE(n) 41.716149 41.866995 41.812253

The optimal number was 8, with the following values:

lagselect$criteria[,8]
##    AIC(n)     HQ(n)     SC(n)    FPE(n) 
##  3.730878  3.809587  3.941815 41.716149

The \(R^2\) score of the model after applying can also be reported for both of the time series used:

VAR_meantemp_adjr <- summary(VAR_est$varresult$meantemp)$adj.r.squared
VAR_meantemp_adjr
## [1] 0.95168
VAR_windspeed_adjr <- summary(VAR_est$varresult$wind_speed)$adj.r.squared
VAR_windspeed_adjr
## [1] 0.231681

A Portmanteau test is provided to test that the residuals are uncorrelated.

serial.test(VAR_est, lags.pt=10, type="PT.asymptotic")
## 
##  Portmanteau Test (asymptotic)
## 
## data:  Residuals of VAR object VAR_est
## Chi-squared = 22.376, df = 8, p-value = 0.004266

The null hypothesis of no autocorrelation is rejected since the p-value is lower than the significance level of 0.05. This suggests the model is not performing as expected…

forecast <- VAR_est |> forecast(h=114)

Now test data is used to evaluate predictions:

predicted_meantemp <- as.numeric(forecast[2]$forecast$meantemp$mean)
actual_meantemp <- as.numeric(ts_test_meantemp)

predicted_windspeed <- as.numeric(forecast[2]$forecast$wind_speed$mean)
actual_winspeed <- as.numeric(ts_test_windspeed)
RMSE_meantemp_VAR <- rmse(predicted_meantemp, actual_meantemp)
RMSE_meantemp_VAR
## [1] 6.796025
RMSE_windspeed_VAR <- rmse(predicted_windspeed, actual_winspeed)
RMSE_windspeed_VAR
## [1] 4.772704
MAE_meantemp_VAR <- mae(predicted_meantemp, actual_meantemp)
MAE_meantemp_VAR
## [1] 5.066458
MAE_windspeed_VAR <- mae(predicted_windspeed, actual_winspeed)
MAE_windspeed_VAR
## [1] 3.739241
rsq <- function (x, y) cor(x, y) ^ 2
RSQ_meantemp_VAR <- rsq(predicted_meantemp, actual_meantemp)
RSQ_meantemp_VAR
## [1] 0.7067414
RSQ_windspeed_VAR <- rsq(predicted_windspeed, actual_winspeed)
RSQ_windspeed_VAR
## [1] 0.007727682

6.2.1.1 meantemp

d <- cbind(Adjusted_R2 = VAR_meantemp_adjr, AIC = lagselect$criteria[,10][0])
# at most 4 decimal places
knitr::kable(d, digits = 4)
Adjusted_R2
0.9517
d <- cbind(R2 = RSQ_meantemp_VAR, RMSE = RMSE_meantemp_VAR, MAE = MAE_meantemp_VAR)
# at most 4 decimal places
knitr::kable(d, digits = 4)
R2 RMSE MAE
0.7067 6.796 5.0665

6.2.1.2 windpseed

d <- cbind(Adjusted_R2 = VAR_meantemp_adjr, AIC = lagselect$criteria[,10][0])
# at most 4 decimal places
knitr::kable(d, digits = 4)
Adjusted_R2
0.9517
d <- cbind(R2 = RSQ_windspeed_VAR, RMSE = RMSE_windspeed_VAR, MAE = MAE_windspeed_VAR)
# at most 4 decimal places
knitr::kable(d, digits = 4)
R2 RMSE MAE
0.0077 4.7727 3.7392

6.2.2 Plot Forecast

Firstly, plot of forecasts are displayed, which are based on the model being trained on the training data.

plot(forecast)

forecast[2]$forecast$meantemp |> autoplot() + autolayer(ts_test_meantemp)

forecast[2]$forecast$wind_speed |>  autoplot() + autolayer(ts_test_windspeed)

secondly, prediction of test data alongside the actual test data is visualized

xts_temp <- xts(ts_test_meantemp, order.by=df_test$date, "%Y-%m-%d")
xts_temp_2 <- xts(forecast[2]$forecast$meantemp$mean, order.by=df_test$date, "%Y-%m-%d")
ts_plot(xts_temp, xts_temp_2)

xts_temp <- xts(ts_test_windspeed, order.by=df_test$date, "%Y-%m-%d")
xts_temp_2 <- xts(forecast[2]$forecast$wind_speed$mean, order.by=df_test$date, "%Y-%m-%d")
ts_plot(xts_temp, xts_temp_2)

6.2.3 Granger Causality

Granger causality is a statistical concept that is used to determine the causal relationship between two time series. The basic idea is that if past values of one time series (\(X\)) are useful in predicting the future values of another time series (\(Y\)), then \(X\) is said to Granger-cause \(Y\). This is typically tested using a statistical test such as a Granger causality test.

Granger_meantemp <- causality(VAR_est, cause = "meantemp")
Granger_meantemp
## $Granger
## 
##  Granger causality H0: meantemp do not Granger-cause wind_speed
## 
## data:  VAR object VAR_est
## F-Test = 7.624, df1 = 8, df2 = 2850, p-value = 3.869e-10
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: meantemp and wind_speed
## 
## data:  VAR object VAR_est
## Chi-squared = 5.815, df = 1, p-value = 0.01589
Granger_windspeed <- causality(VAR_est, cause = "wind_speed")
Granger_windspeed
## $Granger
## 
##  Granger causality H0: wind_speed do not Granger-cause meantemp
## 
## data:  VAR object VAR_est
## F-Test = 3.3039, df1 = 8, df2 = 2850, p-value = 0.0009195
## 
## 
## $Instant
## 
##  H0: No instantaneous causality between: wind_speed and meantemp
## 
## data:  VAR object VAR_est
## Chi-squared = 5.815, df = 1, p-value = 0.01589

Judging by the tests, there is no causality relationship between the two time series.

6.2.4 FEVD

Forecast error variance decomposition (FEVD) is a technique that decomposes the variance of the forecast error into the contributions from specific exogenous shocks.

Two practical applications is that this technique

  • Demonstrates how significant a shock is in explaining the variations of the variables in the model.

  • Demonstrates how that significance varies over time. For example, some shocks may not be responsible for variations in the short-run but may cause longer-term fluctuations.

FEVD1 <- fevd(VAR_est, n.ahead = 150)
FEVD1
## $meantemp
##         meantemp  wind_speed
##   [1,] 1.0000000 0.000000000
##   [2,] 0.9975612 0.002438818
##   [3,] 0.9969064 0.003093578
##   [4,] 0.9972451 0.002754862
##   [5,] 0.9975004 0.002499629
##   [6,] 0.9977037 0.002296278
##   [7,] 0.9956564 0.004343628
##   [8,] 0.9940442 0.005955783
##   [9,] 0.9904370 0.009562962
##  [10,] 0.9876182 0.012381849
##  [11,] 0.9855690 0.014431025
##  [12,] 0.9840666 0.015933357
##  [13,] 0.9826604 0.017339645
##  [14,] 0.9813003 0.018699716
##  [15,] 0.9799288 0.020071230
##  [16,] 0.9786037 0.021396349
##  [17,] 0.9773231 0.022676911
##  [18,] 0.9761321 0.023867890
##  [19,] 0.9750342 0.024965806
##  [20,] 0.9740204 0.025979647
##  [21,] 0.9730915 0.026908537
##  [22,] 0.9722269 0.027773120
##  [23,] 0.9714257 0.028574323
##  [24,] 0.9706798 0.029320197
##  [25,] 0.9699858 0.030014246
##  [26,] 0.9693378 0.030662202
##  [27,] 0.9687320 0.031267957
##  [28,] 0.9681646 0.031835436
##  [29,] 0.9676327 0.032367251
##  [30,] 0.9671337 0.032866292
##  [31,] 0.9666650 0.033334977
##  [32,] 0.9662242 0.033775839
##  [33,] 0.9658089 0.034191134
##  [34,] 0.9654171 0.034582924
##  [35,] 0.9650469 0.034953091
##  [36,] 0.9646967 0.035303272
##  [37,] 0.9643650 0.035634960
##  [38,] 0.9640505 0.035949481
##  [39,] 0.9637519 0.036248055
##  [40,] 0.9634682 0.036531792
##  [41,] 0.9631983 0.036801714
##  [42,] 0.9629412 0.037058751
##  [43,] 0.9626962 0.037303758
##  [44,] 0.9624625 0.037537513
##  [45,] 0.9622393 0.037760733
##  [46,] 0.9620259 0.037974075
##  [47,] 0.9618219 0.038178142
##  [48,] 0.9616265 0.038373492
##  [49,] 0.9614394 0.038560639
##  [50,] 0.9612599 0.038740056
##  [51,] 0.9610878 0.038912184
##  [52,] 0.9609226 0.039077428
##  [53,] 0.9607638 0.039236166
##  [54,] 0.9606113 0.039388749
##  [55,] 0.9604645 0.039535504
##  [56,] 0.9603233 0.039676734
##  [57,] 0.9601873 0.039812722
##  [58,] 0.9600563 0.039943732
##  [59,] 0.9599300 0.040070012
##  [60,] 0.9598082 0.040191793
##  [61,] 0.9596907 0.040309291
##  [62,] 0.9595773 0.040422710
##  [63,] 0.9594678 0.040532239
##  [64,] 0.9593619 0.040638057
##  [65,] 0.9592597 0.040740333
##  [66,] 0.9591608 0.040839226
##  [67,] 0.9590651 0.040934883
##  [68,] 0.9589726 0.041027447
##  [69,] 0.9588830 0.041117048
##  [70,] 0.9587962 0.041203814
##  [71,] 0.9587121 0.041287862
##  [72,] 0.9586307 0.041369305
##  [73,] 0.9585518 0.041448247
##  [74,] 0.9584752 0.041524791
##  [75,] 0.9584010 0.041599031
##  [76,] 0.9583289 0.041671057
##  [77,] 0.9582590 0.041740956
##  [78,] 0.9581912 0.041808808
##  [79,] 0.9581253 0.041874692
##  [80,] 0.9580613 0.041938681
##  [81,] 0.9579992 0.042000845
##  [82,] 0.9579387 0.042061251
##  [83,] 0.9578800 0.042119962
##  [84,] 0.9578230 0.042177040
##  [85,] 0.9577675 0.042232542
##  [86,] 0.9577135 0.042286524
##  [87,] 0.9576610 0.042339038
##  [88,] 0.9576099 0.042390135
##  [89,] 0.9575601 0.042439863
##  [90,] 0.9575117 0.042488268
##  [91,] 0.9574646 0.042535394
##  [92,] 0.9574187 0.042581284
##  [93,] 0.9573740 0.042625979
##  [94,] 0.9573305 0.042669516
##  [95,] 0.9572881 0.042711933
##  [96,] 0.9572467 0.042753266
##  [97,] 0.9572065 0.042793550
##  [98,] 0.9571672 0.042832816
##  [99,] 0.9571289 0.042871097
## [100,] 0.9570916 0.042908423
## [101,] 0.9570552 0.042944823
## [102,] 0.9570197 0.042980326
## [103,] 0.9569850 0.043014957
## [104,] 0.9569513 0.043048744
## [105,] 0.9569183 0.043081711
## [106,] 0.9568861 0.043113881
## [107,] 0.9568547 0.043145280
## [108,] 0.9568241 0.043175928
## [109,] 0.9567942 0.043205847
## [110,] 0.9567649 0.043235058
## [111,] 0.9567364 0.043263581
## [112,] 0.9567086 0.043291436
## [113,] 0.9566814 0.043318641
## [114,] 0.9566548 0.043345214
## [115,] 0.9566288 0.043371172
## [116,] 0.9566035 0.043396532
## [117,] 0.9565787 0.043421311
## [118,] 0.9565545 0.043445524
## [119,] 0.9565308 0.043469186
## [120,] 0.9565077 0.043492313
## [121,] 0.9564851 0.043514917
## [122,] 0.9564630 0.043537014
## [123,] 0.9564414 0.043558616
## [124,] 0.9564203 0.043579735
## [125,] 0.9563996 0.043600386
## [126,] 0.9563794 0.043620579
## [127,] 0.9563597 0.043640327
## [128,] 0.9563404 0.043659640
## [129,] 0.9563215 0.043678530
## [130,] 0.9563030 0.043697007
## [131,] 0.9562849 0.043715082
## [132,] 0.9562672 0.043732764
## [133,] 0.9562499 0.043750064
## [134,] 0.9562330 0.043766991
## [135,] 0.9562164 0.043783554
## [136,] 0.9562002 0.043799761
## [137,] 0.9561844 0.043815622
## [138,] 0.9561689 0.043831145
## [139,] 0.9561537 0.043846338
## [140,] 0.9561388 0.043861209
## [141,] 0.9561242 0.043875766
## [142,] 0.9561100 0.043890015
## [143,] 0.9560960 0.043903965
## [144,] 0.9560824 0.043917622
## [145,] 0.9560690 0.043930994
## [146,] 0.9560559 0.043944086
## [147,] 0.9560431 0.043956906
## [148,] 0.9560305 0.043969459
## [149,] 0.9560182 0.043981752
## [150,] 0.9560062 0.043993791
## 
## $wind_speed
##           meantemp wind_speed
##   [1,] 0.004015367  0.9959846
##   [2,] 0.014088186  0.9859118
##   [3,] 0.031345182  0.9686548
##   [4,] 0.045081879  0.9549181
##   [5,] 0.047250355  0.9527496
##   [6,] 0.050195175  0.9498048
##   [7,] 0.051791471  0.9482085
##   [8,] 0.058569722  0.9414303
##   [9,] 0.059755063  0.9402449
##  [10,] 0.060492777  0.9395072
##  [11,] 0.061230656  0.9387693
##  [12,] 0.062350161  0.9376498
##  [13,] 0.063537758  0.9364622
##  [14,] 0.064733138  0.9352669
##  [15,] 0.065784270  0.9342157
##  [16,] 0.066764878  0.9332351
##  [17,] 0.067666889  0.9323331
##  [18,] 0.068511469  0.9314885
##  [19,] 0.069312014  0.9306880
##  [20,] 0.070078972  0.9299210
##  [21,] 0.070843853  0.9291561
##  [22,] 0.071585299  0.9284147
##  [23,] 0.072310967  0.9276890
##  [24,] 0.073013577  0.9269864
##  [25,] 0.073699390  0.9263006
##  [26,] 0.074368938  0.9256311
##  [27,] 0.075024000  0.9249760
##  [28,] 0.075663715  0.9243363
##  [29,] 0.076289261  0.9237107
##  [30,] 0.076901062  0.9230989
##  [31,] 0.077500145  0.9224999
##  [32,] 0.078086898  0.9219131
##  [33,] 0.078661745  0.9213383
##  [34,] 0.079225064  0.9207749
##  [35,] 0.079777082  0.9202229
##  [36,] 0.080318110  0.9196819
##  [37,] 0.080848353  0.9191516
##  [38,] 0.081368096  0.9186319
##  [39,] 0.081877588  0.9181224
##  [40,] 0.082377090  0.9176229
##  [41,] 0.082866826  0.9171332
##  [42,] 0.083347014  0.9166530
##  [43,] 0.083817854  0.9161821
##  [44,] 0.084279549  0.9157205
##  [45,] 0.084732289  0.9152677
##  [46,] 0.085176264  0.9148237
##  [47,] 0.085611654  0.9143883
##  [48,] 0.086038638  0.9139614
##  [49,] 0.086457388  0.9135426
##  [50,] 0.086868073  0.9131319
##  [51,] 0.087270857  0.9127291
##  [52,] 0.087665900  0.9123341
##  [53,] 0.088053358  0.9119466
##  [54,] 0.088433385  0.9115666
##  [55,] 0.088806131  0.9111939
##  [56,] 0.089171740  0.9108283
##  [57,] 0.089530355  0.9104696
##  [58,] 0.089882117  0.9101179
##  [59,] 0.090227162  0.9097728
##  [60,] 0.090565624  0.9094344
##  [61,] 0.090897632  0.9091024
##  [62,] 0.091223316  0.9087767
##  [63,] 0.091542800  0.9084572
##  [64,] 0.091856206  0.9081438
##  [65,] 0.092163655  0.9078363
##  [66,] 0.092465265  0.9075347
##  [67,] 0.092761149  0.9072389
##  [68,] 0.093051420  0.9069486
##  [69,] 0.093336189  0.9066638
##  [70,] 0.093615564  0.9063844
##  [71,] 0.093889649  0.9061104
##  [72,] 0.094158548  0.9058415
##  [73,] 0.094422364  0.9055776
##  [74,] 0.094681194  0.9053188
##  [75,] 0.094935135  0.9050649
##  [76,] 0.095184284  0.9048157
##  [77,] 0.095428733  0.9045713
##  [78,] 0.095668573  0.9043314
##  [79,] 0.095903894  0.9040961
##  [80,] 0.096134783  0.9038652
##  [81,] 0.096361326  0.9036387
##  [82,] 0.096583608  0.9034164
##  [83,] 0.096801709  0.9031983
##  [84,] 0.097015712  0.9029843
##  [85,] 0.097225695  0.9027743
##  [86,] 0.097431735  0.9025683
##  [87,] 0.097633908  0.9023661
##  [88,] 0.097832290  0.9021677
##  [89,] 0.098026951  0.9019730
##  [90,] 0.098217965  0.9017820
##  [91,] 0.098405400  0.9015946
##  [92,] 0.098589326  0.9014107
##  [93,] 0.098769809  0.9012302
##  [94,] 0.098946916  0.9010531
##  [95,] 0.099120711  0.9008793
##  [96,] 0.099291257  0.9007087
##  [97,] 0.099458616  0.9005414
##  [98,] 0.099622849  0.9003772
##  [99,] 0.099784015  0.9002160
## [100,] 0.099942173  0.9000578
## [101,] 0.100097381  0.8999026
## [102,] 0.100249693  0.8997503
## [103,] 0.100399166  0.8996008
## [104,] 0.100545853  0.8994541
## [105,] 0.100689806  0.8993102
## [106,] 0.100831079  0.8991689
## [107,] 0.100969721  0.8990303
## [108,] 0.101105783  0.8988942
## [109,] 0.101239312  0.8987607
## [110,] 0.101370358  0.8986296
## [111,] 0.101498967  0.8985010
## [112,] 0.101625185  0.8983748
## [113,] 0.101749057  0.8982509
## [114,] 0.101870628  0.8981294
## [115,] 0.101989940  0.8980101
## [116,] 0.102107037  0.8978930
## [117,] 0.102221960  0.8977780
## [118,] 0.102334751  0.8976652
## [119,] 0.102445448  0.8975546
## [120,] 0.102554092  0.8974459
## [121,] 0.102660722  0.8973393
## [122,] 0.102765374  0.8972346
## [123,] 0.102868086  0.8971319
## [124,] 0.102968896  0.8970311
## [125,] 0.103067837  0.8969322
## [126,] 0.103164946  0.8968351
## [127,] 0.103260256  0.8967397
## [128,] 0.103353802  0.8966462
## [129,] 0.103445616  0.8965544
## [130,] 0.103535731  0.8964643
## [131,] 0.103624179  0.8963758
## [132,] 0.103710991  0.8962890
## [133,] 0.103796197  0.8962038
## [134,] 0.103879827  0.8961202
## [135,] 0.103961912  0.8960381
## [136,] 0.104042479  0.8959575
## [137,] 0.104121557  0.8958784
## [138,] 0.104199174  0.8958008
## [139,] 0.104275357  0.8957246
## [140,] 0.104350133  0.8956499
## [141,] 0.104423528  0.8955765
## [142,] 0.104495568  0.8955044
## [143,] 0.104566278  0.8954337
## [144,] 0.104635683  0.8953643
## [145,] 0.104703807  0.8952962
## [146,] 0.104770674  0.8952293
## [147,] 0.104836307  0.8951637
## [148,] 0.104900730  0.8950993
## [149,] 0.104963965  0.8950360
## [150,] 0.105026033  0.8949740
plot(FEVD1)

Evidenced by the plots, figure above implies that forecast error of the meantemp is attributed to itself. Same goes for wind_speed. However, when the VAR model is fitted on weekly aggregated data, then injecting shock in meantemp would also be influenced by wind_speed. This investigation was not added in code and plots, to keep the work concise.

6.3 Feedforward Neural Network

6.3.1 Forecast

6.3.1.1 wind_speed

nnetar function creates a feed-forward neural networks forecasting univariate time series. The default parameters are a single hidden layer and lagged inputs

set.seed(34)
# nnetar() requires a numeric vector or time series object as
# input ?nnetar() can be seen for more info on the function
# nnetar() by default fits multiple neural net models and
# gives averaged results xreg option allows for only numeric
# vectors in nnetar() function
fit_windspeed = nnetar(ts_train_windspeed)
fit_windspeed
## Series: ts_train_windspeed 
## Model:  NNAR(1,1,2)[365] 
## Call:   nnetar(y = ts_train_windspeed)
## 
## Average of 20 networks, each of which is
## a 2-2-1 network with 9 weights
## options were - linear output units 
## 
## sigma^2 estimated as 14.78
forecast_windspeed <- forecast(fit_windspeed, h = 114, PI = T)
#forecast_windspeed

6.3.1.2 meantemp

fit_meantemp = nnetar(ts_train_meantemp)
fit_meantemp
## Series: ts_train_meantemp 
## Model:  NNAR(11,1,6)[365] 
## Call:   nnetar(y = ts_train_meantemp)
## 
## Average of 20 networks, each of which is
## a 12-6-1 network with 85 weights
## options were - linear output units 
## 
## sigma^2 estimated as 1.884
forecast_meantemp <- forecast(fit_meantemp, h = 114, PI = T)
#forecast_meantemp

6.3.2 Evluate

6.3.2.1 meantemp

predicted <- as.numeric(forecast_meantemp$mean)
actual <- as.numeric(ts_test_meantemp)
RMSE_meantemp_NN <- rmse(predicted, actual)
RMSE_meantemp_NN
## [1] 3.134642
MAE_meantemp_NN <- mae(predicted, actual)
MAE_meantemp_NN
## [1] 2.456404
rsq <- function (x, y) cor(x, y) ^ 2

RSQ_meantemp_NN <- rsq(actual, predicted)
RSQ_meantemp_NN
## [1] 0.7723737

6.3.2.2 wind_speed

predicted <- as.numeric(forecast_windspeed$mean)
actual <- as.numeric(ts_test_windspeed)
RMSE_windspeed_NN <- rmse(predicted, actual)
RMSE_windspeed_NN
## [1] 3.544034
MAE_windspeed_NN <- mae(predicted, actual)
MAE_windspeed_NN
## [1] 2.761422
RSQ_windspeed_NN <- rsq(actual, predicted)
RSQ_windspeed_NN
## [1] 0.05910477

Now present tables of report both of time series.

6.3.2.3 meantemp

d <- cbind(R2 = RSQ_meantemp_NN, RMSE = RMSE_meantemp_NN, MAE = MAE_meantemp_NN)
# at most 4 decimal places
knitr::kable(d, digits = 4)
R2 RMSE MAE
0.7724 3.1346 2.4564

6.3.2.4 windpseed

d <- cbind(R2 = RSQ_windspeed_NN, RMSE = RMSE_windspeed_NN, MAE = MAE_windspeed_NN)
# at most 4 decimal places
knitr::kable(d, digits = 4)
R2 RMSE MAE
0.0591 3.544 2.7614

6.3.3 Plot Forecast

First we plot forecasts based on the model being trained on the training data.

forecast_windspeed |> autoplot() + autolayer(ts_test_windspeed)

forecast_meantemp |> autoplot() + autolayer(ts_test_meantemp)

Then, we plot the prediction of test data alongside the actual test data.

xts_temp <- xts(ts_test_meantemp, order.by=df_test$date, "%Y-%m-%d")
xts_temp_2 <- xts(forecast_meantemp$mean, order.by=df_test$date, "%Y-%m-%d")

ts_plot(xts_temp, xts_temp_2)

xts_temp <- xts(ts_test_windspeed, order.by=df_test$date, "%Y-%m-%d")
xts_temp_2 <- xts(forecast_windspeed$mean, order.by=df_test$date, "%Y-%m-%d")

ts_plot(xts_temp, xts_temp_2)

6.4 LSTM Neural Network

RNNs are prudent candidates among neural network models, as they are designed for modelling sequential data, of which time series is the quintessential example. Moreover, RNNs can effectively remember relevant long-term context from the input features. Remembering long-term context is not possible in multi-layer perceptrons (MLPs) as in their structure it is often presumed that all the inputs and outputs are independent of each other. the other hand, in RNN the behavior of hidden neurons might not just be determined by the activations in previous hidden layers, but also by the activations at earlier times. Furthermore, the activations of hidden and output neurons will not be determined just by the current input to the network, but also by earlier inputs.

The long-short term memory model is used in the following. The most effective sequence models used in practical applications are called gated RNNs (GRNNs). These include the long short-term memory and networks based on the gated recurrent unit. Generally, as RNNs are equipped with learning long-term dependencies, they have the issue of vanishing or exploding gradients. GRNNs are based on the idea of creating paths through time that have derivatives that neither vanish nor explode, which was made possible by including connection weights that may alter at each time-step. This allows them to tackle the challenge of vanishing or exploding gradients. Once that information has been used, however, it might be useful for the neural network to forget the old state. The intuition is that although remembering past state of the network (either earlier inputs or outputs) is useful, but not all states are useful, hence it would be an intelligent and more human-wise behavior that the model decides when to clear the past state, which is what GRNNs do.

The difference between a GRNN and LSTM is that GRNNs have input gate and output gate, while LSTM has an additional forget gate.

Only for this model, the data is normalized with standard scaler. After that, the data is altered to have the required format for the LSTM model implemented using Keras library.

get_scaling_factors <- function(data){
  out <- c(mean = mean(data), sd = sd(data))
  return(out)
}

normalize_data <- function(data, scaling_factors, reverse = FALSE) {
  
  if (reverse) temp <- (data * scaling_factors[2]) + scaling_factors[1]
  else temp <- (data - scaling_factors[1]) / scaling_factors[2]
  
  out <- temp %>% as.matrix()
  return(out)
}


kerasize_data <- function(data, x = TRUE, lag = 114, pred = 114) {
  
  if (x) {
    
    temp <- sapply(
      1:(length(data) - lag - pred + 1)
      ,function(x) data[x:(x + lag - 1), 1]
    ) %>% t()
    
    out <- array(
      temp %>% unlist() %>% as.numeric()
      ,dim = c(nrow(temp), lag, 1)
    )
    
  }  else {
    
    temp <- sapply(
      (1 + lag):(length(data) - pred + 1)
      ,function(x) data[x:(x + lag - 1), 1]
    ) %>% t()
    
    out <- array(
      temp %>% unlist() %>% as.numeric()
      ,dim = c(nrow(temp), pred, 1)
    )
    
  }
  
  return(out)
  
}

kerasize_pred_input <- function(data, lag = 114, pred = 114){
  temp <- data[(length(data) - pred + 1):length(data)]
  temp <- normalize_data(temp, get_scaling_factors(data))
  out <- array(temp, c(1, lag, 1))
  return(out)
}
lstm_build_model <- function(x, y, units = 128, batch = 1, epochs = 20, rate = 0.2, seed = 2137){
  
  lag = dim(x)[2]
  
  lstm_model <- keras_model_sequential()

  lstm_model %>%
    layer_lstm(units = 128
               ,batch_input_shape = c(batch, lag, 1)
               ,return_sequences = TRUE
               ,stateful = TRUE) %>%
    layer_dropout(rate = rate) %>%
    layer_lstm(units = 64
               ,return_sequences = TRUE
               ,stateful = TRUE) %>%
    layer_dropout(rate = rate) %>%
    time_distributed(layer_dense(units = 1))

  lstm_model %>%
    compile(loss = 'mean_squared_error'
            ,optimizer = 'adam'
            ,metrics = 'accuracy')

  tensorflow::set_random_seed(seed)
  lstm_model %>% fit(
    x = x
    ,y = y
    ,batch_size = batch
    ,epochs = epochs
    ,verbose = 0
    ,shuffle = FALSE)
  
  out <- list(
    model = lstm_model
    ,x = x
    ,batch = batch
    ,lag = lag
    ,pred = dim(y)[2]
  )
  return(out)

}
lstm_forecast <- function(x_test, model, scaling_factors){
  
  batch <- model$batch
  
  temp <- model$model %>%
    predict(x_test, batch_size = batch) %>% 
    .[, , 1] %>%
    normalize_data(scaling_factors = scaling_factors, reverse = TRUE)
  
  out <- list(
    forecast = temp
    ,scaling_factors = scaling_factors
  )
  
  return(out)
  
}

6.4.1 Forecast

6.4.1.1 meantemp

# remove the first row of df_test, as it is common with df_train
#df_test <- df_test[-1,] |> head(1)
#data <- as.data.frame(rbind(df_train, df_test))
#data <- merge(df_train, df_test, all=TRUE)

data_meantemp <- ts(c(ts_train_meantemp, ts_test_meantemp), 
           start = start(ts_train_meantemp), 
           frequency = frequency(ts_test_meantemp)) 
scaling_factors <- get_scaling_factors(data_meantemp)
data_meantemp_norm <- normalize_data(data_meantemp, scaling_factors)

x_data <- kerasize_data(data_meantemp_norm, x = TRUE, lag = 114, pred = 114)
y_data <- kerasize_data(data_meantemp_norm, x = FALSE, lag = 114, pred = 114)
x_test <- kerasize_pred_input(data_meantemp_norm, lag = 114, pred = 114)
model <- lstm_build_model(x_data, y_data)
prediction_meantemp <- lstm_forecast(x_test, model, scaling_factors)

6.4.1.2 windspeed

data_windspeed <- ts(c(ts_train_windspeed, ts_test_windspeed), 
           start = start(ts_train_windspeed), 
           frequency = frequency(ts_test_windspeed))    
scaling_factors <- get_scaling_factors(data_windspeed)
data_windspeed_norm <- normalize_data(data_windspeed, scaling_factors)

x_data <- kerasize_data(data_windspeed_norm, x = TRUE, lag = 114, pred = 114)
y_data <- kerasize_data(data_windspeed_norm, x = FALSE, lag = 114, pred = 114)
x_test <- kerasize_pred_input(data_windspeed_norm, lag = 114, pred = 114)
model <- lstm_build_model(x_data, y_data)
prediction_windspeed <- lstm_forecast(x_test, model, scaling_factors)

6.4.2 Evaluate

6.4.2.1 meantemp

predicted_meantemp <- as.numeric(prediction_meantemp$forecast)
actual_meantemp <- as.numeric(ts_test_meantemp)
RMSE_meantemp_LSTM <- rmse(predicted_meantemp, actual_meantemp)
MAE_meantemp_LSTM <- mae(predicted_meantemp, actual_meantemp)
RSQ_meantemp_LSTM <- rsq(predicted_meantemp, actual_meantemp)

6.4.2.2 windspeed

predicted_windspeed <- as.numeric(prediction_windspeed$forecast)
actual_windspeed <- as.numeric(ts_test_windspeed)
RMSE_windspeed_LSTM <- rmse(predicted_windspeed, actual_windspeed)
MAE_windspeed_LSTM <- mae(predicted_windspeed, actual_windspeed)
RSQ_windspeed_LSTM <- rsq(predicted_windspeed, actual_windspeed)

6.4.3 Plot Forecast

6.4.3.1 meantemp

xts_temp <- xts(ts_test_meantemp, 
                order.by=df_test$date, 
                "%Y-%m-%d")
xts_temp_2 <- xts(prediction_meantemp$forecast, 
                  order.by=df_test$date, 
                  "%Y-%m-%d")

ts_plot(xts_temp, xts_temp_2)

6.4.3.2 windspeed

xts_temp <- xts(ts_test_windspeed, 
                order.by=df_test$date, 
                "%Y-%m-%d")
xts_temp_2 <- xts(prediction_windspeed$forecast, 
                  order.by=df_test$date, 
                  "%Y-%m-%d")

ts_plot(xts_temp, xts_temp_2)

6.5 Compare Metrics

metrics_list <- c("RMSE", "MAE")
ARMA_meantemp <- c(RMSE_ARMA, MAE_ARMA)

VAR_meantemp <- c(RMSE_meantemp_VAR, MAE_meantemp_VAR)
VAR_windspeed <- c(RMSE_windspeed_VAR, MAE_windspeed_VAR)

NN_meantemp <- c(RMSE_meantemp_NN, MAE_meantemp_NN)
NN_windspeed <- c(RMSE_windspeed_NN, MAE_windspeed_NN)

LSTM_meantemp <- c(RMSE_meantemp_LSTM,
                   MAE_meantemp_LSTM)

LSTM_windspeed <- c(RMSE_windspeed_LSTM,
                   MAE_windspeed_LSTM)

df_eval <- data.frame(metrics_list, ARMA_meantemp, VAR_meantemp, VAR_windspeed, NN_meantemp, NN_windspeed, LSTM_meantemp, LSTM_windspeed)

df_eval[, sapply(df_eval, is.numeric)] <- apply(df_eval[, sapply(df_eval, is.numeric)], 2, round, 3)

colnames(df_eval) <- lapply(colnames(df_eval), function(x) gsub("^.*_", "", x))
#table(df_eval) |> htmlTable
#knitr::kable(df_eval, col.names = names(df_eval))
#df_eval
htmlTable(df_eval,
          digits = 3,
          cgroup = c("Metrics","ARMA","VAR","NN", "LSTM"),
          n.cgroup = c(1,1,2,2,2),
          
)
Metrics   ARMA   VAR   NN   LSTM
list   meantemp   meantemp windspeed   meantemp windspeed   meantemp windspeed
1 RMSE   4.299   6.796 4.773   3.135 3.544   11.336 4.672
2 MAE   3.576   5.066 3.739   2.456 2.761   10.744 3.738

7 Conclusion

  1. Among all the models, the feedforward neural network outperformed the other models.
  2. Based on Granger causality tests, there is no causal relationship between meantemp and wind_speed time series, i.e., between mean temperature and wind speed of Delhi, India.
  3. Injecting a shock in meantemp would be only affected by variations of the meantemp time series. Similarly, injecting a shock in wind_speed would also be only affected by variations of wind_speed. However, weekly aggregated data would leave footprint of wind_speed when injecting shock in meantemp.